import threading
import pandas as pd
import numpy as np
from spider.SingleArticleSpider import SingleArticleSpider


def worker(small_df, threed_mark):
    """Crawl every article listed in one chunk of the article table.

    Rows whose ``url`` equals ``"NOTFOUND"`` are reported and skipped; every
    other URL is handed to ``SingleArticleSpider.run`` together with the
    thread label. Progress is printed for each row.

    Args:
        small_df: pandas DataFrame with a ``url`` and a ``title`` column,
            one row per article.
        threed_mark: Label identifying the calling thread; it is shown in
            the progress output and forwarded to the spider.
    """
    total = len(small_df)
    # One spider instance is reused for all rows handled by this thread.
    spider = SingleArticleSpider()

    # Walk the column values positionally. A chunk sliced out of a larger
    # DataFrame keeps the parent's index labels, so label lookups such as
    # small_df["url"][i] with i in range(len(small_df)) raise KeyError (or hit
    # the wrong row) as soon as the labels are not exactly 0..n-1.
    rows = zip(small_df["url"], small_df["title"])
    for count, (url, title) in enumerate(rows, start=1):
        progress = f"线程：{threed_mark} 当前解析位置：第 {count}/{total} 条"

        if url == "NOTFOUND":
            print(progress)
            print("广告链接或错误数据：NOTFOUND跳过")
            continue

        # f-string formatting also copes with non-string cells (e.g. a NaN
        # title), which plain "+" concatenation would reject with TypeError.
        print(f"{progress}标题为：{title}链接为：{url}")
        spider.run(url, threed_mark)


class ThreedAccelerateSpider(object):
    """Split the article CSV into chunks and crawl each chunk on its own thread."""

    def __init__(self, csv_path='./csv_collect/big_articles.csv', num_chunks=20):
        """Set up an idle spider; no file is read until ``loadData`` runs.

        Args:
            csv_path: Path of the CSV file listing the articles to crawl.
            num_chunks: Number of sub-DataFrames to create; ``run`` starts
                one worker thread per sub-DataFrame.

        Raises:
            ValueError: If ``num_chunks`` is smaller than 1.
        """
        if num_chunks < 1:
            raise ValueError("num_chunks must be at least 1")
        self.csv_path = csv_path
        self.num_chunks = num_chunks
        # Full DataFrame holding everything read from the CSV file.
        self.df = None
        # Sub-DataFrames, one per worker thread.
        self.dfs = []

    def loadData(self):
        """Read the CSV and partition its rows into ``num_chunks`` DataFrames.

        The chunks are stored in ``self.dfs``. Row counts differ by at most
        one, with the larger chunks first; chunks may be empty when the file
        has fewer rows than ``num_chunks``.
        """
        # read_csv already consumes the first line of the file as the column
        # names, so every row of self.df is data. Treating row 0 as a header
        # would drop it from the split and duplicate it into every chunk.
        self.df = pd.read_csv(self.csv_path)
        print(type(self.df))

        # Start from a clean list so repeated calls do not accumulate chunks.
        self.dfs = []

        base_size, extra = divmod(len(self.df), self.num_chunks)
        start = 0
        for chunk_no in range(self.num_chunks):
            # The first `extra` chunks take one additional row each.
            stop = start + base_size + (1 if chunk_no < extra else 0)
            # Slicing keeps the column names; resetting the index gives every
            # chunk labels 0..n-1 so label and positional access agree.
            chunk = self.df.iloc[start:stop].reset_index(drop=True)
            self.dfs.append(chunk)
            start = stop

        # Preview the first few chunks.
        for chunk in self.dfs[:3]:
            print(chunk)

    def run(self):
        """Load the data, crawl every chunk in parallel and wait for completion."""
        self.loadData()

        threads = []

        # Start one worker thread per chunk, labelled with the chunk number.
        for i, small_df in enumerate(self.dfs):
            threed_mark = str(i)
            t = threading.Thread(target=worker, args=(small_df, threed_mark))
            t.start()
            threads.append(t)

        # Block until every worker thread has finished.
        for t in threads:
            t.join()

        print("所有的子线程都已经完成")

# Script entry point
if __name__ == "__main__":
    # Any failure is reported on stdout instead of surfacing as a traceback.
    # NOTE(review): only loadData() is invoked here, so the CSV is read and
    # previewed but no crawl threads are started; call run() instead if a
    # full crawl is intended -- confirm.
    try:
        accelerator = ThreedAccelerateSpider()
        accelerator.loadData()
    except Exception as err:
        print("错误:", err)







